library(dplyr)
library(data.table)
library(leaflet)
library(car)
library(RColorBrewer)
library(geojsonio)
library(sp)
library(pedometrics)
library(leafsync)
knitr::opts_knit$set(root.dir = normalizePath("../analysis/data/"))
path <- knitr::opts_knit$get("root.dir")
df <- read.csv(paste0(path, '/final_data.csv'))
head(df[order(df$num_props),])
## GEOID num_props median_value pct_owned BEDROOMS BATHS BLDG_AREA
## 192 55079120300 2 139950 0.5000000 3.500000 1.000000 3361.500
## 206 55079186900 4 863450 0.5000000 5.250000 3.750000 10741.000
## 193 55079120400 5 138700 1.0000000 3.200000 1.400000 3025.400
## 190 55079090200 6 225150 0.8333333 3.500000 1.833333 4618.500
## 191 55079110100 15 95000 0.8000000 2.800000 1.066667 2508.800
## 110 55079011300 21 168900 0.5238095 2.809524 1.333333 2818.619
## LOT_AREA AIR_CONDITIONING garage vacant pct_residential
## 192 7130.500 1.0000000 1.0000000 0.000000000 1.0000000
## 206 11215.000 0.5000000 0.5000000 0.000000000 0.8384432
## 193 5400.000 0.8000000 1.0000000 0.000000000 1.0000000
## 190 8688.333 1.0000000 1.0000000 0.000000000 1.0000000
## 191 4981.733 0.5333333 0.7333333 0.000000000 0.4128271
## 110 2886.238 0.2857143 0.4285714 0.004587156 0.5751411
## pct_single_fam median_hh_income population pct_white pct_black pct_hisp
## 192 0.47738408 60000 2102 75.1 3.3 20.0
## 206 0.02374724 53345 2353 84.8 3.2 1.9
## 193 0.16097691 55365 6810 73.5 4.7 14.9
## 190 1.00000000 82368 1926 78.9 8.5 2.3
## 191 0.24015316 35286 4200 48.7 17.3 29.9
## 110 0.01850200 43077 2052 80.3 9.9 4.8
## pct_asian pct_coll_plus ALAND pop_density Homicide other_violent_crime
## 192 0.0 29.4 885487 23.738 0 0
## 206 8.1 72.3 427986 54.978 0 16
## 193 2.5 23.3 4795004 14.202 0 12
## 190 8.5 57.9 3817170 5.046 0 0
## 191 2.9 19.6 2897237 14.497 0 5
## 110 3.1 66.8 433646 47.320 1 105
## property_crime violent_crime_trend
## 192 1 0
## 206 193 -2
## 193 12 -4
## 190 1 0
## 191 20 -1
## 110 678 8
### Four of these tracts have a single digit number of properties, and including
### some of them in initial models (particularly the outlier wiht a median value
### of $863K) was influencing the model, so let's remove them
df <- df[df$num_props > 10,]
### visualize the relationships to get a better sense of the relationship of
### each variable with median value
par(mfrow=c(8, 3), mar=c(2,2,2,2))
y <- df$median_value
for (i in 4:26)
{
title <- colnames(df[i])
plot(df[,i], y, main=title, xlab=title, ylab='median value')
abline(lm(y~df[,i]), col='red')
}
### In looking at the above plots, vacancy, homicide and violent crime have a clear,
### negative relationship with home values, but only up to a certain point, so we'll
### experiment with creating log versions of those fields
## Because the log transformations can create Inf values, we'll first set fields with
## values of zero to be negligibly positive
df$vacant[df$vacant== 0] <- .0001
df$vacLog <- log(df$vacant)
### because homicides and "other" violent crime are closely correlated and have very
### similar relationships with median home value, let's simplify this by combining
### them into one field
df$violentCrime <- df$Homicide + df$other_violent_crime
df$violentCrime[df$violentCrime== 0] <- .0001
df$violentCrimeLog <- log(df$violentCrime)
## Do the log versions of these fields have a stronger relationship with home values? Yes.
sprintf("vacancy: %s; vacLog: %s", cor(df$vacant, df$median_value)[1],
cor(df$vacLog, df$median_value)[1])
## [1] "vacancy: -0.540622789490693; vacLog: -0.710353939500846"
sprintf("violent crime: %s; log violent crime: %s", cor(df$violentCrime, df$median_value)[1],
cor(df$violentCrimeLog, df$median_value)[1])
## [1] "violent crime: -0.548009059130518; log violent crime: -0.584100423375757"
### let's look at correlations bewteen median value and potential features
cors <- sort(cor(df[3:29])[,1], decreasing=TRUE)
cors
## median_value pct_coll_plus pct_white BATHS
## 1.00000000 0.79627323 0.73345942 0.71283848
## BLDG_AREA median_hh_income pct_owned AIR_CONDITIONING
## 0.70649176 0.69266722 0.63483912 0.47054985
## garage LOT_AREA BEDROOMS population
## 0.31229880 0.30921344 0.24793719 0.18172322
## pct_residential ALAND pct_single_fam pop_density
## 0.15833838 0.14995737 0.11452208 0.02989415
## pct_asian violent_crime_trend pct_hisp property_crime
## -0.03525563 -0.05985659 -0.11785635 -0.13047943
## Homicide vacant pct_black other_violent_crime
## -0.48931972 -0.54062279 -0.54642153 -0.54694953
## violentCrime violentCrimeLog vacLog
## -0.54800906 -0.58410042 -0.71035394
mod1 <- lm(median_value~BLDG_AREA+BATHS+pct_coll_plus+pct_white
+vacLog+pct_black+garage+BEDROOMS+property_crime+I(BATHS*BLDG_AREA)
+I(BATHS*BEDROOMS)+I(BLDG_AREA*BEDROOMS)
+I(pct_black*pct_coll_plus)+I(pct_black*vacLog)
+I(pct_white*property_crime)+violentCrimeLog
+I(pct_black*violentCrimeLog)
, data = df)
summary(mod1)
##
## Call:
## lm(formula = median_value ~ BLDG_AREA + BATHS + pct_coll_plus +
## pct_white + vacLog + pct_black + garage + BEDROOMS + property_crime +
## I(BATHS * BLDG_AREA) + I(BATHS * BEDROOMS) + I(BLDG_AREA *
## BEDROOMS) + I(pct_black * pct_coll_plus) + I(pct_black *
## vacLog) + I(pct_white * property_crime) + violentCrimeLog +
## I(pct_black * violentCrimeLog), data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -47578 -7286 758 7498 36792
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.604e+05 5.286e+04 -3.033 0.002765 **
## BLDG_AREA -4.259e-01 2.848e+01 -0.015 0.988085
## BATHS -2.281e+04 6.496e+04 -0.351 0.725853
## pct_coll_plus 1.317e+03 1.723e+02 7.647 1.09e-12 ***
## pct_white -1.581e+01 1.505e+02 -0.105 0.916419
## vacLog -2.173e+03 1.859e+03 -1.169 0.243939
## pct_black -2.001e+03 6.234e+02 -3.210 0.001564 **
## garage 2.114e+04 7.721e+03 2.738 0.006779 **
## BEDROOMS 1.346e+05 2.797e+04 4.812 3.09e-06 ***
## property_crime -2.576e+01 1.206e+01 -2.136 0.033992 *
## I(BATHS * BLDG_AREA) 9.937e+01 1.310e+01 7.587 1.55e-12 ***
## I(BATHS * BEDROOMS) -7.266e+04 2.576e+04 -2.821 0.005312 **
## I(BLDG_AREA * BEDROOMS) -2.388e+01 6.283e+00 -3.801 0.000196 ***
## I(pct_black * pct_coll_plus) -1.984e+01 4.414e+00 -4.495 1.22e-05 ***
## I(pct_black * vacLog) -1.510e+02 3.966e+01 -3.809 0.000190 ***
## I(pct_white * property_crime) 8.165e-01 2.407e-01 3.392 0.000848 ***
## violentCrimeLog -4.442e+03 3.805e+03 -1.167 0.244622
## I(pct_black * violentCrimeLog) 2.256e+02 9.057e+01 2.491 0.013611 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 13900 on 185 degrees of freedom
## Multiple R-squared: 0.9654, Adjusted R-squared: 0.9623
## F-statistic: 304 on 17 and 185 DF, p-value: < 2.2e-16
sprintf('AIC: %s', AIC(mod1))
## [1] "AIC: 4468.41899606828"
### The residuals are normally distributed, and it doesn't look like
### any of the observations are too influential
hist(resid(mod1))
par(mfrow=c(2,2))
plot(mod1)
### add predicted values and residuals to the df so that we can visualize them
df$preds = round(predict(mod1), 0)
df$residuals = round(resid(mod1), 0)
### navigate to geo folder
knitr::opts_knit$set(root.dir = normalizePath('../../geo/'))
# knitr::opts_knit$set(root.dir = normalizePath('../geo/'))
geo_path <- knitr::opts_knit$get("root.dir")
tracts <- geojson_read(paste0(geo_path, '/mke_county_tract.geojson'), what = "sp")
### merge df and geo obj, inner join only
tracts_df <- merge(tracts, df, by='GEOID', all=FALSE)
m1Bins <- c(min(tracts_df$residuals), -35000, -25000, -15000, -5000, 5000,
15000, 25000, 35000, max(tracts_df$residuals))
m1Pal <- colorBin('RdYlGn', domain = tracts_df$residuals, bins = m1Bins, reverse = TRUE)
### Create labels for hovering
m1Labels <- sprintf(
'<strong> %s</strong><br/>Expected value: <strong>$%g</strong><br/>
Actual value: <strong>$%g</strong><br/>Difference: <strong>$%g</strong>',
tracts_df$NAMELSAD, tracts_df$preds,
tracts_df$median_value, tracts_df$residuals
) %>% lapply(htmltools::HTML)
m1 <- leaflet(tracts_df) %>%
setView(lat=43.060174, lng=-87.925549, zoom = 11) %>%
addTiles() %>%
addPolygons(
fillColor = ~m1Pal(tracts_df$residuals),
weight = 1,
color = 'white',
opacity = 0.75,
fillOpacity = 0.75,
### add hover-over capability
highlight = highlightOptions(
weight = 5,
color = "#666",
fillOpacity = 0.7,
bringToFront = TRUE),
### adding labels for hover tool
label = m1Labels,
labelOptions = labelOptions(
style = list("font-weight" = "normal", padding = "3px 8px"),
textsize = "11px",
direction = "auto")
) %>%
### add legend
addLegend(pal = m1Pal, values = ~density, opacity = 0.7,
title = 'Expected minus<br>Actual Value',
position = "topright")
### map for actual values only
m2Bins <- c(min(tracts_df$median_value), 30000, 60000, 90000, 120000, 150000,
180000, 210000, 240000, max(tracts_df$median_value))
m2Ppal <- colorBin('Blues', domain = tracts_df$median_value, bins = m2Bins)
### Create labels for hovering
m2Labels <- sprintf(
'<strong> %s</strong><br/>Median value: <strong>$%g</strong>',
tracts_df$NAMELSAD,tracts_df$median_value
) %>% lapply(htmltools::HTML)
m2 <- leaflet(tracts_df) %>%
setView(lat=43.060174, lng=-87.925549, zoom = 11) %>%
addTiles() %>%
addPolygons(
fillColor = ~m2Ppal(tracts_df$median_value),
weight = 1,
color = 'white',
opacity = 0.75,
fillOpacity = 0.75,
### add hover-over capability
highlight = highlightOptions(
weight = 5,
color = "#666",
fillOpacity = 0.7,
bringToFront = TRUE),
### adding labels for hover tool
label = m2Labels,
labelOptions = labelOptions(
style = list("font-weight" = "normal", padding = "3px 8px"),
textsize = "11px",
direction = "auto")
) %>%
### add legend
addLegend(pal = m2Ppal, values = ~density, opacity = 0.7,
title = 'Median value',
position = "topright")
In the below plots, we can see that one neighborhood on the west side of the city really stands out for having an actual median value that is nearly $50K below what the model expects the value to be. Hover over neighborhoods for details.
latticeview(m1, m2, ncol=2)
### To be able to make comparisons between different variables, first scale the data
### (excluding GEOID and median value)
scaled <- as.data.frame(scale(subset(df, select=-c(GEOID, median_value))))
### join back in geo and median val cols
scaled <- cbind(subset(df, select=c(GEOID, median_value)), scaled)
### final simple model
mod2 <- lm(median_value~BATHS+median_hh_income+pct_black+
pct_coll_plus+property_crime+vacLog,
data = scaled)
summary(mod2)
##
## Call:
## lm(formula = median_value ~ BATHS + median_hh_income + pct_black +
## pct_coll_plus + property_crime + vacLog, data = scaled)
##
## Residuals:
## Min 1Q Median 3Q Max
## -57609 -13137 2045 10896 54836
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 106690 1493 71.461 < 2e-16 ***
## BATHS 34478 1786 19.300 < 2e-16 ***
## median_hh_income 13187 2339 5.637 5.96e-08 ***
## pct_black -10347 2020 -5.122 7.20e-07 ***
## pct_coll_plus 15422 2343 6.583 4.13e-10 ***
## property_crime 5196 1576 3.297 0.00116 **
## vacLog -20373 2182 -9.338 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 21270 on 196 degrees of freedom
## Multiple R-squared: 0.9143, Adjusted R-squared: 0.9117
## F-statistic: 348.5 on 6 and 196 DF, p-value: < 2.2e-16
### And we can see that multicolinearity is fairly low
vif(mod2)
## BATHS median_hh_income pct_black pct_coll_plus
## 1.424669 2.442891 1.821536 2.450280
## property_crime vacLog
## 1.108642 2.125138
## residuals are normally distributed, though there are a few on the low end that stand out
hist(resid(mod2))
## no obvservations are too skewed or are having too much influence over the model
par(mfrow = c(2,2))
plot(mod2)
In the below plot, which shows the influence of each variable in the model, we can see that the average number of bathrooms – which it’s worth noting is strongly correlated with average home size, lot size and the average number of bedrooms – has the greatest influence on average home values in a neighborhood. The model also finds that even when controlling for several other factors, neighborhoods with larger black populations have lower property values, though the influence of that variable is smaller than most other variables. And while it’s hard to imagine that other fields, such as say, violent crime, don’t have a significant effect on property values, the effect of them is difficult to interpret because adding them quickly produces high multicolinearity within the model.
par(mar = c(4,6,1,1))
barplot(sort(coef(mod2)[2:7]), xlab = 'Coefficients',
horiz = T, las=2, col='blue',
names.arg=c("Vacancy", "% Black", "Prop Crime", "Income",
"% Coll. deg.", "Baths"))
abline(v=0)